import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
# Global figure size for all seaborn/matplotlib plots in this notebook.
sns.set(rc={'figure.figsize':(11.7,8.27)})
# Load the bank marketing dataset.
# NOTE(review): some distributions of bank-full.csv are semicolon-delimited;
# this assumes comma-separated — confirm against the actual file.
bankData = pd.read_csv('bank-full.csv')
bankData.head()
class UniVariateAnalysis:
    """Univariate summary statistics for one numeric column of a dataframe:
    quartiles, IQR, Tukey whiskers (1.5 * IQR fences) and outlier rows."""

    def __init__(self, df, columnName):
        self.columnName = columnName
        self.dataframe = df
        self.series = df[columnName]

    def get_q1(self):
        return self.series.quantile(0.25)

    def get_q2(self):
        return self.series.quantile(0.5)

    def get_q3(self):
        return self.series.quantile(0.75)

    def get_q4(self):
        return self.series.quantile(1)

    def get_iqr(self):
        """Interquartile range: Q3 - Q1."""
        return self.get_q3() - self.get_q1()

    def get_min(self):
        return self.series.min()

    def get_median(self):
        return self.series.median()

    def get_max(self):
        return self.series.max()

    def get_data_type(self):
        return self.series.dtypes

    def get_lower_whisker_value(self):
        """Tukey lower fence: Q1 - 1.5 * IQR."""
        return self.get_q1() - 1.5 * self.get_iqr()

    def get_higher_whisker_value(self):
        """Tukey upper fence: Q3 + 1.5 * IQR."""
        return self.get_q3() + 1.5 * self.get_iqr()

    def get_lower_outlier_rows(self):
        """Rows whose value falls below the lower fence."""
        below_fence = self.dataframe[self.columnName] < self.get_lower_whisker_value()
        return self.dataframe.loc[below_fence]

    def get_higher_outlier_rows(self):
        """Rows whose value lies above the upper fence."""
        above_fence = self.dataframe[self.columnName] > self.get_higher_whisker_value()
        return self.dataframe.loc[above_fence]

    def get_std(self):
        return self.series.std()

    def get_mean(self):
        return self.series.mean()
class UniVariateReport:
def __init__(self, uniVariateAnalysis):
self.analysis = uniVariateAnalysis
def print_quartiles(self):
print("Q1: " , self.analysis.get_q1())
print("Q2: ", self.analysis.get_q2())
print("Q3: ", self.analysis.get_q3())
print("Q4: ", self.analysis.get_q4())
print("Mean: ", self.analysis.get_mean())
print("Min: ", self.analysis.get_min())
print("Median: ", self.analysis.get_median())
print("Max: ", self.analysis.get_max())
def print_whiskers(self):
print("Top whisker: ", self.analysis.get_higher_whisker_value())
print("Bottom whisker: ", self.analysis.get_lower_whisker_value())
def print_data_type(self):
print("Data type: ", self.analysis.get_data_type())
def print_value_range(self):
print(f'Range of values: ({self.analysis.get_min()}, {self.analysis.get_max()})')
def print_std(self):
print("Standard deviation: ", self.analysis.get_std())
def print_report(self):
self.print_data_type()
self.print_value_range()
self.print_std()
self.print_quartiles()
self.print_whiskers()
# --- Univariate EDA: age and the categorical columns ---
# NOTE: bare expressions below (e.g. bankData.describe()) only display output
# in a notebook; they are no-ops when run as a plain script.
# NOTE(review): sns.distplot is deprecated (removed in recent seaborn);
# histplot/displot is the modern equivalent — confirm installed version.
ageAnalysis = UniVariateAnalysis(bankData, 'age')
ageAnalysisReport = UniVariateReport(ageAnalysis)
ageAnalysisReport.print_report()
ageAnalysis.get_higher_outlier_rows()
ageAnalysis.get_lower_outlier_rows()
bankData.describe()
sns.distplot(bankData['age'], bins=25)
sns.boxplot(bankData['age'])
#Bad data search
bankData.loc[(bankData['age'] < 18)]
# job / marital / education / default categorical overviews
bankData['job'].unique().size
bankData['job'].dtypes
bankData['job'].unique()
sns.countplot(x="job", data=bankData)
bankData['marital'].dtypes
bankData['marital'].unique().size
bankData['marital'].unique()
sns.countplot(x="marital", data=bankData)
bankData['education'].unique()
bankData['education'].unique().size
sns.countplot(x="education", data=bankData)
sns.countplot(x="default", data=bankData)
bankData['default'].unique()
# --- 'balance' ---
balance_analysis = UniVariateAnalysis(bankData, 'balance')
balance_analysis_report = UniVariateReport(balance_analysis)
balance_analysis_report.print_report()
sns.distplot(bankData['balance'], bins=50)
sns.boxplot(bankData['balance'])
balance_analysis.get_higher_outlier_rows()
balance_analysis.get_lower_outlier_rows()
bankData['balance'].isnull().values.any()
sns.countplot(x="housing", data=bankData)
sns.countplot(x="loan", data=bankData)
sns.countplot(x="contact", data=bankData)
bankData['contact'].unique()
# --- 'day' ---
day_analysis = UniVariateAnalysis(bankData, 'day')
# BUG FIX: the report was constructed from ageAnalysis, so the printed
# 'day' statistics were actually the age statistics.
day_analysis_report = UniVariateReport(day_analysis)
day_analysis_report.print_report()
day_analysis.get_higher_outlier_rows()
day_analysis.get_lower_outlier_rows()
bankData['day'].isnull().values.any()
sns.countplot(x="day", data=bankData)
sns.boxplot(bankData['day'])
# (stray fragment of truncated notebook output removed; it listed month
# values such as 'mar', 'apr', 'sep' — see bankData['month'].unique() below)
# --- 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome' ---
sns.countplot(x="month", data=bankData)
bankData['month'].unique()
duration_analysis = UniVariateAnalysis(bankData, 'duration')
duration_analysis_report = UniVariateReport(duration_analysis)
duration_analysis_report.print_report()
bankData['duration'].isnull().values.any()
sns.distplot(bankData['duration'], bins=50)
sns.boxplot(bankData['duration'])
duration_analysis.get_higher_outlier_rows()
duration_analysis.get_lower_outlier_rows()
bankData['campaign'].isnull().values.any()
campaign_analysis = UniVariateAnalysis(bankData, 'campaign')
campaign_analysis_report = UniVariateReport(campaign_analysis)
campaign_analysis_report.print_report()
campaign_analysis.get_higher_outlier_rows()
campaign_analysis.get_lower_outlier_rows()
sns.boxplot(bankData['campaign'])
sns.distplot(bankData['campaign'], bins=40)
bankData['campaign'].unique()
bankData['pdays'].isnull().values.any()
pdays_analysis = UniVariateAnalysis(bankData, 'pdays')
pdays_analysis_report = UniVariateReport(pdays_analysis)
pdays_analysis_report.print_report()
pdays_analysis.get_lower_outlier_rows()
pdays_analysis.get_higher_outlier_rows()
bankData.loc[bankData['pdays'] < 0]
#invalid
# BUG FIX: '|' binds tighter than '>', so the original expression
# `(bankData['pdays'] < -1) | bankData['pdays'] > 900` evaluated as
# `((pdays < -1) | pdays) > 900` instead of the intended union of the
# two conditions. Both comparisons must be parenthesized.
bankData.loc[(bankData['pdays'] < -1) | (bankData['pdays'] > 900)]
sns.distplot(bankData['pdays'], bins=100)
sns.boxplot(bankData['pdays'])
previous_analysis = UniVariateAnalysis(bankData, 'previous')
previous_analysis_report = UniVariateReport(previous_analysis)
previous_analysis_report.print_report()
bankData['previous'].isnull().values.any()
sns.distplot(bankData['previous'], bins=100)
sns.boxplot(bankData['previous'])
previous_analysis.get_higher_outlier_rows()
previous_analysis.get_lower_outlier_rows()
bankData['poutcome'].unique()
sns.countplot(x="poutcome", data=bankData)
# Encode the target as 1/0 for numeric (lineplot/scatterplot) bivariate views.
bankDataWithIntTarget = bankData.copy()
bankDataWithIntTarget['Target'] = bankDataWithIntTarget['Target'].replace(['yes','no'],[1,0])
bankDataWithIntTarget
This column really doesn't seem to show much relationship between age and target. On the count plot we can see that many more people have 'no', but the increases of both 'yes' and 'no' appear congruent with each other as age rises and then falls. On the scatterplot no dots appear skewed toward one side of the graph; it looks like it is oscillating.
# Age vs. subscription: scatter of raw values, then counts split by Target.
sns.scatterplot(bankData['age'], bankData['Target']) # Plots the scatter plot using two variables
sns.countplot(x="age", hue="Target", data=bankData)
No immediate relationship here - we may see some relationship with whether the client will subscribe if they are a student, but unfortunately that profession seems to be less frequent in the data. All other professions more or less look like they have the same ratio of yes to no.
# Subscription split per job category.
sns.countplot(x="job", hue="Target", data=bankData)
Same as above here: there doesn't seem to be a large difference between whether or not the client subscribed and the marital status of the person.
# Subscription split per marital status.
sns.countplot(x="marital", hue="Target", data=bankData)
Here we go - there seems to be a greater ratio of subscribers among people with secondary or tertiary education versus the other categories. Unknown looks to have the lowest, which we predicted in question 1.
# Subscription split per education level.
sns.countplot(x="education", hue="Target", data=bankData)
Easy to see here that there is a very big correlation with people who do not default on a loan and people who subscribe to term deposits
# Subscription split by credit-default status.
sns.countplot(x="default", hue="Target", data=bankData)
Looks to be like there is a strong relationship between balance of an account and whether or not that person is subscribed to term deposits. Looks like rich people may not subscribe to term deposits as much (possibly due to other lucrative investments they take)
# Mean subscription rate (Target is 1/0 here) as a function of balance.
sns.lineplot(x='balance', y='Target', data=bankDataWithIntTarget)
semi strong relationship between people having no housing loan and deciding to subscribe to a term deposit. We can see that even though we see less people that do not have a housing loan, there are still more people total that also have a subscription to term deposits versus people who do have a housing loan and subscribe to term deposits
# Subscription split by housing-loan status.
sns.countplot(x="housing", hue="Target", data=bankData)
Hard to tell here if there is a relationship if any at all, there may be a slight relationship between having a personal loan and not subscribing to term deposits.
# Subscription split by personal-loan status.
sns.countplot(x="loan", hue="Target", data=bankData)
Looks to be a semi-strong relationship between cellular contact and whether or not the person subscribes. Maybe people with cell phones receive a different type of notification that better convinces them to subscribe to a term deposit, versus a telephone where the user may just get annoyed at being called. Unknown may be low because it is hard to market to someone without a line of contact.
# Subscription split by contact channel (cellular / telephone / unknown).
sns.countplot(x="contact", hue="Target", data=bankData)
Difficult to tell just from this graph without considering the month column but it appears that around the beginning and 10th of each month is the highest. Maybe this is due to the fact that people are getting paid around this time and may feel more confident to subscribe to a deposit term
# Mean subscription rate by day of month of last contact.
sns.lineplot(x='day', y='Target', data=bankDataWithIntTarget)
Again, like day, it is hard to draw conclusions logically, but there does appear to be a strong relationship between contacting the customer in December, March, and September-October and whether the person will subscribe to a term deposit. This could be because many employers give employees Christmas bonuses, while others (like my employer) wait until around March to give them out.
# Encode month names as 1-12 so the x-axis sorts chronologically.
bank_df_int_target_int_month = bankDataWithIntTarget.copy()
bank_df_int_target_int_month['month'] = bank_df_int_target_int_month['month'].replace(['jan','feb','mar','apr','may','jun','jul','aug','sep','oct','nov','dec'],[1,2,3,4,5,6,7,8,9,10,11,12])
sns.lineplot(x='month', y='Target', data=bank_df_int_target_int_month)
There is a very strong relationship between duration and whether or not the person subscribes to term deposits - as discussed earlier, this may be because there is an average amount of time needed to explain the details of term deposits, probably in the 1k-2k duration range. People who make short calls with the bank may not ask many questions because they are simply not interested. Very long calls may end up being about something else, or the person may be asking so many questions because they are on the fence and are attempting to 'talk themselves into it', or the bank employee may be trying to convince a person who is slightly interested but on average doesn't decide to subscribe.
# Mean subscription rate as a function of call duration.
sns.lineplot(x='duration', y='Target', data=bankDataWithIntTarget)
There seems to be a very strong correlation between people who have not been contacted many times during this campaign and whether they subscribed to term deposits. There is a sweet spot where a customer will usually subscribe between the first and tenth contact; it may be easier to convince them early, while the more you call them, the less likely they are to subscribe (generally). If a customer has said no more than ten times, they have most likely made up their mind.
# Mean subscription rate by number of contacts this campaign.
sns.lineplot(x='campaign', y='Target', data=bankDataWithIntTarget)
Very difficult to tell here if there is any relationship; the graph oscillates quite a bit, especially for people who were last contacted more than 400 days ago.
# Mean subscription rate by days since last contact in a previous campaign.
sns.lineplot(x='pdays', y='Target', data=bankDataWithIntTarget)
As we said for campaign, there seems to be a fairly strong relationship toward the beginning of the graph, possibly because it doesn't take many attempts to convince a person who is likely to subscribe. However, if the client has refused multiple times, the chance they finally say yes appears to decrease.
# Mean subscription rate by number of contacts before this campaign.
sns.lineplot(x='previous', y='Target', data=bankDataWithIntTarget)
There is a very strong relationship between those who subscribed last time and those subscribing to term deposits now. However, the total data points for that scenario are relatively low; still, this may be a good indicator of whether a person will subscribe if they have subscribed in the past.
# Subscription split by previous-campaign outcome, then column/dtype overview.
sns.countplot(x="poutcome", hue="Target", data=bankData)
bankData.info()
def get_q1(df, columnName):
    """25th percentile of df[columnName]."""
    return df[columnName].quantile(0.25)


def get_q2(df, columnName):
    """Median (50th percentile) of df[columnName]."""
    return df[columnName].quantile(0.5)


def get_q3(df, columnName):
    """75th percentile of df[columnName]."""
    return df[columnName].quantile(0.75)


def get_q4(df, columnName):
    """Maximum (100th percentile) of df[columnName]."""
    return df[columnName].quantile(1)


def get_iqr(df, columnName):
    """Interquartile range: Q3 - Q1."""
    return get_q3(df, columnName) - get_q1(df, columnName)


def get_lower_whisker_value(df, columnName):
    """Tukey lower fence: Q1 - 1.5 * IQR."""
    return get_q1(df, columnName) - 1.5 * get_iqr(df, columnName)


def get_higher_whisker_value(df, columnName):
    """Tukey upper fence: Q3 + 1.5 * IQR."""
    return get_q3(df, columnName) + 1.5 * get_iqr(df, columnName)


def get_df_without_lower_outliers(df, columnNames):
    """Copy of df with rows below the lower fence dropped, per column.

    Fences are computed on the original df, not the progressively
    filtered copy.
    """
    filtered = df.copy()
    for col in columnNames:
        filtered = filtered[filtered[col] >= get_lower_whisker_value(df, col)]
    return filtered


def get_df_without_higher_outliers(df, columnNames):
    """Copy of df with rows above the upper fence dropped, per column."""
    filtered = df.copy()
    for col in columnNames:
        filtered = filtered[filtered[col] <= get_higher_whisker_value(df, col)]
    return filtered


def get_df_without_outliers(df, columnNames):
    """Copy of df keeping only rows inside [lower fence, upper fence] for
    every column in columnNames (fences computed on the original df)."""
    filtered = df.copy()
    for col in columnNames:
        low = get_lower_whisker_value(df, col)
        high = get_higher_whisker_value(df, col)
        filtered = filtered[(filtered[col] >= low) & (filtered[col] <= high)]
    return filtered
#I'm going to change the values of the months from strings since we have an accurate numerical representation for those
prepared_bank_df = bankData.copy()
_month_numbers = {'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
                  'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12}
prepared_bank_df['month'] = prepared_bank_df['month'].replace(_month_numbers)
#lets change the columns default, housing, loan, & target from 'yes'/'no' to 1/0, respectively
_yes_no = {'yes': 1, 'no': 0}
for _col in ('default', 'housing', 'loan', 'Target'):
    prepared_bank_df[_col] = prepared_bank_df[_col].replace(_yes_no)
#lets create dummy variables for job, marital, education, contact, & poutcome
prepared_bank_df_with_dummies = pd.get_dummies(prepared_bank_df)
prepared_bank_df_with_dummies
#lets remove some of the outliers that we discussed earlier
bank_df_with_dummies_no_outliers_prepared = get_df_without_outliers(prepared_bank_df_with_dummies, ['age','balance','duration','campaign','previous'])
bank_df_with_dummies_no_outliers_prepared
#lets create the training set and test set (70:30)
def GetTrainingAndTestData(df, targetColumn):
    """Split df into a 70/30 train/test split (random_state=1).

    Returns X_train, X_test, y_train, y_test in train_test_split order.
    """
    features = df.drop(targetColumn, axis=1)
    labels = df[targetColumn]
    return train_test_split(features, labels, test_size=0.3, random_state=1)
bankData.head()
def GetDecisionTree(df, targetColumn):
    """Fit and return a DecisionTreeClassifier on the standard 70/30 split.

    BUG FIX: the original body only unpacked the split and implicitly
    returned None — it never built a tree despite the name. The dead,
    commented-out GetLogisticRegressionModel draft that followed has been
    removed (PrintLogisticRegressionPerformance supersedes it).

    df: feature dataframe including the target column.
    targetColumn: name of the column to predict.
    """
    X_train, X_test, y_train, y_test = GetTrainingAndTestData(df, targetColumn)
    dTree = DecisionTreeClassifier(criterion='gini', random_state=1)
    return dTree.fit(X_train, y_train)
def PrintGradientBoosting(df, targetColumn):
    """Fit a GradientBoostingClassifier (50 trees) on the 70/30 split and
    print score, accuracy, precision, recall and F1, plus a confusion-matrix
    heatmap.

    df: feature dataframe including the (1/0) target column.
    targetColumn: name of the column to predict.
    """
    X_train, X_test, y_train, y_test = GetTrainingAndTestData(df, targetColumn)
    gradient = GradientBoostingClassifier(n_estimators=50, random_state=1)
    gradient = gradient.fit(X_train, y_train)
    y_predict = gradient.predict(X_test)
    print("score: ", gradient.score(X_test, y_test))
    # labels=[1, 0] puts the positive class in the first row/column.
    cm = metrics.confusion_matrix(y_test, y_predict, labels=[1, 0])
    df_cm = pd.DataFrame(cm, index=["1", "0"],
                         columns=["Predict 1", "Predict 0"])
    # .loc avoids the deprecated positional Series[int] lookup on a string index.
    TP = df_cm.loc["1", "Predict 1"]
    FP = df_cm.loc["0", "Predict 1"]
    FN = df_cm.loc["1", "Predict 0"]
    TN = df_cm.loc["0", "Predict 0"]
    accuracy = (TP + TN) / (TP + TN + FP + FN)
    recall = TP / (TP + FN)
    precision = TP / (TP + FP)
    print("Accuracy:", accuracy)
    # BUG FIX: precision/recall labels were swapped in the original prints.
    print("Precision:", precision)
    print("Recall:", recall)
    # BUG FIX: F1 = 2*P*R/(P+R); the factor of 2 was missing.
    print("F1 Score: ", (2 * precision * recall) / (precision + recall))
    plt.figure(figsize=(7, 5))
    sns.heatmap(df_cm, annot=True, fmt='g')
def PrintBoostingPerformance(df, targetColumn):
    """Fit an AdaBoostClassifier (10 estimators) on the 70/30 split and
    print score, accuracy, precision, recall and F1, plus a confusion-matrix
    heatmap.

    df: feature dataframe including the (1/0) target column.
    targetColumn: name of the column to predict.
    """
    X_train, X_test, y_train, y_test = GetTrainingAndTestData(df, targetColumn)
    boosting = AdaBoostClassifier(n_estimators=10, random_state=1)
    boosting = boosting.fit(X_train, y_train)
    y_predict = boosting.predict(X_test)
    print("score:", boosting.score(X_test, y_test))
    # labels=[1, 0] puts the positive class in the first row/column.
    cm = metrics.confusion_matrix(y_test, y_predict, labels=[1, 0])
    df_cm = pd.DataFrame(cm, index=["1", "0"],
                         columns=["Predict 1", "Predict 0"])
    # .loc avoids the deprecated positional Series[int] lookup on a string index.
    TP = df_cm.loc["1", "Predict 1"]
    FP = df_cm.loc["0", "Predict 1"]
    FN = df_cm.loc["1", "Predict 0"]
    TN = df_cm.loc["0", "Predict 0"]
    accuracy = (TP + TN) / (TP + TN + FP + FN)
    recall = TP / (TP + FN)
    precision = TP / (TP + FP)
    print("Accuracy:", accuracy)
    # BUG FIX: precision/recall labels were swapped in the original prints.
    print("Precision:", precision)
    print("Recall:", recall)
    # BUG FIX: F1 = 2*P*R/(P+R); the factor of 2 was missing.
    print("F1 Score: ", (2 * precision * recall) / (precision + recall))
    plt.figure(figsize=(7, 5))
    sns.heatmap(df_cm, annot=True, fmt='g')
def PrintBaggingPerformance(df, targetColumn):
    """Fit a BaggingClassifier of 50 gini decision trees on the 70/30 split
    and print score, accuracy, precision, recall and F1, plus a
    confusion-matrix heatmap.

    df: feature dataframe including the (1/0) target column.
    targetColumn: name of the column to predict.
    """
    X_train, X_test, y_train, y_test = GetTrainingAndTestData(df, targetColumn)
    dTree = DecisionTreeClassifier(criterion='gini', random_state=1)
    # The original also fit dTree standalone; that fit was unused —
    # BaggingClassifier clones the (unfitted) estimator internally.
    # NOTE(review): base_estimator was renamed to estimator in sklearn 1.2 —
    # confirm the installed version before upgrading.
    bgcl = BaggingClassifier(base_estimator=dTree, n_estimators=50, random_state=1)
    bgcl = bgcl.fit(X_train, y_train)
    y_predict = bgcl.predict(X_test)
    print("bagging score:", bgcl.score(X_test, y_test))
    # labels=[1, 0] puts the positive class in the first row/column.
    cm = metrics.confusion_matrix(y_test, y_predict, labels=[1, 0])
    df_cm = pd.DataFrame(cm, index=["1", "0"],
                         columns=["Predict 1", "Predict 0"])
    # .loc avoids the deprecated positional Series[int] lookup on a string index.
    TP = df_cm.loc["1", "Predict 1"]
    FP = df_cm.loc["0", "Predict 1"]
    FN = df_cm.loc["1", "Predict 0"]
    TN = df_cm.loc["0", "Predict 0"]
    accuracy = (TP + TN) / (TP + TN + FP + FN)
    recall = TP / (TP + FN)
    precision = TP / (TP + FP)
    print("Accuracy:", accuracy)
    # BUG FIX: precision/recall labels were swapped in the original prints.
    print("Precision:", precision)
    print("Recall:", recall)
    # BUG FIX: F1 = 2*P*R/(P+R); the factor of 2 was missing.
    print("F1 Score: ", (2 * precision * recall) / (precision + recall))
    plt.figure(figsize=(7, 5))
    sns.heatmap(df_cm, annot=True, fmt='g')
def PrintDecisionTreePerformance(df, targetColumn):
    """Fit a gini DecisionTreeClassifier on the 70/30 split and print train
    and test scores, accuracy, precision, recall and F1, plus a
    confusion-matrix heatmap.

    df: feature dataframe including the (1/0) target column.
    targetColumn: name of the column to predict.
    """
    X_train, X_test, y_train, y_test = GetTrainingAndTestData(df, targetColumn)
    dTree = DecisionTreeClassifier(criterion='gini', random_state=1)
    dTree.fit(X_train, y_train)
    print("training dtree score: ", dTree.score(X_train, y_train))
    # The original printed the test score twice; once is enough.
    print("test dtree score: ", dTree.score(X_test, y_test))
    y_predict = dTree.predict(X_test)
    # labels=[1, 0] puts the positive class in the first row/column.
    cm = metrics.confusion_matrix(y_test, y_predict, labels=[1, 0])
    df_cm = pd.DataFrame(cm, index=["1", "0"],
                         columns=["Predict 1", "Predict 0"])
    # .loc avoids the deprecated positional Series[int] lookup on a string index.
    TP = df_cm.loc["1", "Predict 1"]
    FP = df_cm.loc["0", "Predict 1"]
    FN = df_cm.loc["1", "Predict 0"]
    TN = df_cm.loc["0", "Predict 0"]
    accuracy = (TP + TN) / (TP + TN + FP + FN)
    recall = TP / (TP + FN)
    precision = TP / (TP + FP)
    print("Accuracy:", accuracy)
    # BUG FIX: precision/recall labels were swapped in the original prints.
    print("Precision:", precision)
    print("Recall:", recall)
    # BUG FIX: F1 = 2*P*R/(P+R); the factor of 2 was missing.
    print("F1 Score: ", (2 * precision * recall) / (precision + recall))
    plt.figure(figsize=(7, 5))
    sns.heatmap(df_cm, annot=True, fmt='g')
def PrintLogisticRegressionPerformance(df, targetColumn):
    """Fit a liblinear LogisticRegression on the 70/30 split and print the
    model score, accuracy, precision, recall and F1, plus a confusion-matrix
    heatmap.

    df: feature dataframe including the (1/0) target column.
    targetColumn: name of the column to predict.
    """
    # BUG FIX: the original ignored targetColumn and hard-coded 'Target';
    # using the shared split helper also makes this consistent with the
    # other Print* functions (same split, seed and test size).
    x_train, x_test, y_train, y_test = GetTrainingAndTestData(df, targetColumn)
    model = LogisticRegression(solver="liblinear")
    model.fit(x_train, y_train)
    y_predict = model.predict(x_test)
    # (The unused coef_df/intercept dataframe from the original was removed.)
    model_score = model.score(x_test, y_test)
    print("model score: ", model_score)
    # labels=[1, 0] puts the positive class in the first row/column.
    cm = metrics.confusion_matrix(y_test, y_predict, labels=[1, 0])
    df_cm = pd.DataFrame(cm, index=["1", "0"],
                         columns=["Predict 1", "Predict 0"])
    # .loc avoids the deprecated positional Series[int] lookup on a string index.
    TP = df_cm.loc["1", "Predict 1"]
    FP = df_cm.loc["0", "Predict 1"]
    FN = df_cm.loc["1", "Predict 0"]
    TN = df_cm.loc["0", "Predict 0"]
    accuracy = (TP + TN) / (TP + TN + FP + FN)
    recall = TP / (TP + FN)
    precision = TP / (TP + FP)
    print("Accuracy:", accuracy)
    # BUG FIX: precision/recall labels were swapped in the original prints.
    print("Precision:", precision)
    print("Recall:", recall)
    # BUG FIX: F1 = 2*P*R/(P+R); the factor of 2 was missing.
    print("F1 Score: ", (2 * precision * recall) / (precision + recall))
    plt.figure(figsize=(7, 5))
    # fmt='g' for consistency with the other confusion-matrix heatmaps.
    sns.heatmap(df_cm, annot=True, fmt='g')
def GetGradientMetrics(df, targetColumn):
    """Fit a GradientBoostingClassifier (50 trees) on the 70/30 split and
    return (accuracy, recall, precision, score, f1)."""
    X_train, X_test, y_train, y_test = GetTrainingAndTestData(df, targetColumn)
    gradient = GradientBoostingClassifier(n_estimators=50, random_state=1)
    gradient = gradient.fit(X_train, y_train)
    y_predict = gradient.predict(X_test)
    # labels=[1, 0] puts the positive class in the first row/column.
    cm = metrics.confusion_matrix(y_test, y_predict, labels=[1, 0])
    df_cm = pd.DataFrame(cm, index=["1", "0"],
                         columns=["Predict 1", "Predict 0"])
    # .loc avoids the deprecated positional Series[int] lookup on a string index.
    TP = df_cm.loc["1", "Predict 1"]
    FP = df_cm.loc["0", "Predict 1"]
    FN = df_cm.loc["1", "Predict 0"]
    TN = df_cm.loc["0", "Predict 0"]
    accuracy = (TP + TN) / (TP + TN + FP + FN)
    recall = TP / (TP + FN)
    precision = TP / (TP + FP)
    score = gradient.score(X_test, y_test)
    # BUG FIX: F1 = 2*P*R/(P+R); the factor of 2 was missing.
    f1 = (2 * precision * recall) / (precision + recall)
    return accuracy, recall, precision, score, f1
def GetBoostingMetrics(df, targetColumn):
    """Fit an AdaBoostClassifier (10 estimators) on the 70/30 split and
    return (accuracy, recall, precision, score, f1)."""
    X_train, X_test, y_train, y_test = GetTrainingAndTestData(df, targetColumn)
    boosting = AdaBoostClassifier(n_estimators=10, random_state=1)
    boosting = boosting.fit(X_train, y_train)
    y_predict = boosting.predict(X_test)
    # labels=[1, 0] puts the positive class in the first row/column.
    cm = metrics.confusion_matrix(y_test, y_predict, labels=[1, 0])
    df_cm = pd.DataFrame(cm, index=["1", "0"],
                         columns=["Predict 1", "Predict 0"])
    # .loc avoids the deprecated positional Series[int] lookup on a string index.
    TP = df_cm.loc["1", "Predict 1"]
    FP = df_cm.loc["0", "Predict 1"]
    FN = df_cm.loc["1", "Predict 0"]
    TN = df_cm.loc["0", "Predict 0"]
    accuracy = (TP + TN) / (TP + TN + FP + FN)
    recall = TP / (TP + FN)
    precision = TP / (TP + FP)
    score = boosting.score(X_test, y_test)
    # BUG FIX: F1 = 2*P*R/(P+R); the factor of 2 was missing.
    f1 = (2 * precision * recall) / (precision + recall)
    return accuracy, recall, precision, score, f1
def GetBaggingMetrics(df, targetColumn):
    """Fit a BaggingClassifier of 50 gini decision trees on the 70/30 split
    and return (accuracy, recall, precision, score, f1)."""
    X_train, X_test, y_train, y_test = GetTrainingAndTestData(df, targetColumn)
    dTree = DecisionTreeClassifier(criterion='gini', random_state=1)
    # The original also fit dTree standalone; that fit was unused —
    # BaggingClassifier clones the (unfitted) estimator internally.
    bgcl = BaggingClassifier(base_estimator=dTree, n_estimators=50, random_state=1)
    bgcl = bgcl.fit(X_train, y_train)
    y_predict = bgcl.predict(X_test)
    # labels=[1, 0] puts the positive class in the first row/column.
    cm = metrics.confusion_matrix(y_test, y_predict, labels=[1, 0])
    df_cm = pd.DataFrame(cm, index=["1", "0"],
                         columns=["Predict 1", "Predict 0"])
    # .loc avoids the deprecated positional Series[int] lookup on a string index.
    TP = df_cm.loc["1", "Predict 1"]
    FP = df_cm.loc["0", "Predict 1"]
    FN = df_cm.loc["1", "Predict 0"]
    TN = df_cm.loc["0", "Predict 0"]
    accuracy = (TP + TN) / (TP + TN + FP + FN)
    recall = TP / (TP + FN)
    precision = TP / (TP + FP)
    score = bgcl.score(X_test, y_test)
    # BUG FIX: F1 = 2*P*R/(P+R); the factor of 2 was missing.
    f1 = (2 * precision * recall) / (precision + recall)
    return accuracy, recall, precision, score, f1
def GetDecisionTreeMetrics(df, targetColumn):
    """Fit a gini DecisionTreeClassifier on the 70/30 split and return
    (accuracy, recall, precision, score, f1)."""
    X_train, X_test, y_train, y_test = GetTrainingAndTestData(df, targetColumn)
    dTree = DecisionTreeClassifier(criterion='gini', random_state=1)
    dTree.fit(X_train, y_train)
    y_predict = dTree.predict(X_test)
    # labels=[1, 0] puts the positive class in the first row/column.
    cm = metrics.confusion_matrix(y_test, y_predict, labels=[1, 0])
    df_cm = pd.DataFrame(cm, index=["1", "0"],
                         columns=["Predict 1", "Predict 0"])
    # .loc avoids the deprecated positional Series[int] lookup on a string index.
    TP = df_cm.loc["1", "Predict 1"]
    FP = df_cm.loc["0", "Predict 1"]
    FN = df_cm.loc["1", "Predict 0"]
    TN = df_cm.loc["0", "Predict 0"]
    accuracy = (TP + TN) / (TP + TN + FP + FN)
    recall = TP / (TP + FN)
    precision = TP / (TP + FP)
    score = dTree.score(X_test, y_test)
    # BUG FIX: F1 = 2*P*R/(P+R); the factor of 2 was missing.
    f1 = (2 * precision * recall) / (precision + recall)
    return accuracy, recall, precision, score, f1
def GetLogisticRegressionMetrics(df, targetColumn):
    """Fit a liblinear LogisticRegression on the 70/30 split and return
    (accuracy, recall, precision, score, f1)."""
    # BUG FIX: the original ignored targetColumn and hard-coded 'Target';
    # the shared split helper also keeps the split consistent with the
    # other Get*Metrics functions.
    x_train, x_test, y_train, y_test = GetTrainingAndTestData(df, targetColumn)
    model = LogisticRegression(solver="liblinear")
    model.fit(x_train, y_train)
    y_predict = model.predict(x_test)
    # (The unused coef_df/intercept dataframe from the original was removed.)
    score = model.score(x_test, y_test)
    # labels=[1, 0] puts the positive class in the first row/column.
    cm = metrics.confusion_matrix(y_test, y_predict, labels=[1, 0])
    df_cm = pd.DataFrame(cm, index=["1", "0"],
                         columns=["Predict 1", "Predict 0"])
    # .loc avoids the deprecated positional Series[int] lookup on a string index.
    TP = df_cm.loc["1", "Predict 1"]
    FP = df_cm.loc["0", "Predict 1"]
    FN = df_cm.loc["1", "Predict 0"]
    TN = df_cm.loc["0", "Predict 0"]
    accuracy = (TP + TN) / (TP + TN + FP + FN)
    recall = TP / (TP + FN)
    precision = TP / (TP + FP)
    # BUG FIX: F1 = 2*P*R/(P+R); the factor of 2 was missing.
    f1 = (2 * precision * recall) / (precision + recall)
    return accuracy, recall, precision, score, f1
def GetMetricDataframe(df, columnName):
    """Build a dataframe comparing accuracy/recall/precision/score/f1
    across the five classifiers (one row per model).

    df: feature dataframe including the target column.
    columnName: name of the target column.
    """
    model_metric_fns = (
        ('gradient', GetGradientMetrics),
        ('boosting', GetBoostingMetrics),
        ('bagging', GetBaggingMetrics),
        ('dtree', GetDecisionTreeMetrics),
        ('logistic', GetLogisticRegressionMetrics),
    )
    metric_names = ['accuracy', 'recall', 'precision', 'score', 'f1']
    # Each Get*Metrics returns a tuple in metric_names order.
    per_model = {label: fn(df, columnName) for label, fn in model_metric_fns}
    labels = [label for label, _ in model_metric_fns]
    data = {metric: [per_model[label][pos] for label in labels]
            for pos, metric in enumerate(metric_names)}
    return pd.DataFrame(data, columns=metric_names, index=labels)
# Train and evaluate each model on the prepared, outlier-free dataframe,
# then collect every model's metrics into a single comparison table.
PrintLogisticRegressionPerformance(bank_df_with_dummies_no_outliers_prepared, 'Target')
PrintDecisionTreePerformance(bank_df_with_dummies_no_outliers_prepared, 'Target')
PrintBaggingPerformance(bank_df_with_dummies_no_outliers_prepared, 'Target')
PrintBoostingPerformance(bank_df_with_dummies_no_outliers_prepared, 'Target')
PrintGradientBoosting(bank_df_with_dummies_no_outliers_prepared, 'Target')
GetMetricDataframe(bank_df_with_dummies_no_outliers_prepared, 'Target')
The bagging method seems to be the best due to its high accuracy, precision, and score.